In [ ]:
import numpy as np 
import pandas as pd

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import seaborn as sns
In [ ]:
# Load the AI4I 2020 predictive-maintenance dataset and engineer features.
data = pd.read_csv("ai4i2020.csv")
data = data.replace("?", np.nan)

# Coerce every column that parses as numeric to float; leave text columns alone.
for column in data.columns:
    try:
        data[column] = data[column].astype(float)
    except (ValueError, TypeError):
        # Non-numeric column (e.g. 'Type', 'Product ID') -- keep original dtype.
        pass

data = data.drop(columns=['UDI', 'Product ID'])

# Recode the five failure-mode indicator columns into one multiclass label:
# 0 = no failure, 1 = TWF, 2 = HDF, 3 = PWF, 4 = OSF, 5 = RNF.
# NOTE(review): a row with several indicators set ends up with the
# highest-numbered code, matching the original sequential-overwrite behavior.
failure_modes = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
data['Machine failure'] = 0
for code, mode in enumerate(failure_modes, start=1):
    # .loc avoids chained indexing (data[col][mask] = v), which raises
    # SettingWithCopyWarning and silently stops working in pandas >= 3.
    data.loc[data[mode] == 1, 'Machine failure'] = code
data = data.drop(columns=failure_modes)

# Engineered features.
data['Power'] = data['Rotational speed [rpm]'] * data['Torque [Nm]']
data['Temperature difference'] = data['Process temperature [K]'] - data['Air temperature [K]']

# Fix the column order: label first, then raw sensors, then engineered features.
data = data[[
    'Machine failure',
    'Type',
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Power',
    'Temperature difference'
]]
data.describe(include='all')
display(data)
Machine failure Type Air temperature [K] Process temperature [K] Rotational speed [rpm] Torque [Nm] Tool wear [min] Power Temperature difference
0 0 M 298.1 308.6 1551.0 42.8 0.0 66382.8 10.5
1 0 L 298.2 308.7 1408.0 46.3 3.0 65190.4 10.5
2 0 L 298.1 308.5 1498.0 49.4 5.0 74001.2 10.4
3 0 L 298.2 308.6 1433.0 39.5 7.0 56603.5 10.4
4 0 L 298.2 308.7 1408.0 40.0 9.0 56320.0 10.5
... ... ... ... ... ... ... ... ... ...
9995 0 M 298.8 308.4 1604.0 29.5 14.0 47318.0 9.6
9996 0 H 298.9 308.4 1632.0 31.8 17.0 51897.6 9.5
9997 0 M 299.0 308.6 1645.0 33.4 22.0 54943.0 9.6
9998 0 H 299.0 308.7 1408.0 48.5 25.0 68288.0 9.7
9999 0 M 299.0 308.7 1500.0 40.2 30.0 60300.0 9.7

10000 rows × 9 columns

In [ ]:
data.head()  # quick peek at the first five rows after feature engineering
Out[ ]:
Machine failure Type Air temperature [K] Process temperature [K] Rotational speed [rpm] Torque [Nm] Tool wear [min] Power Temperature difference
0 0 M 298.1 308.6 1551.0 42.8 0.0 66382.8 10.5
1 0 L 298.2 308.7 1408.0 46.3 3.0 65190.4 10.5
2 0 L 298.1 308.5 1498.0 49.4 5.0 74001.2 10.4
3 0 L 298.2 308.6 1433.0 39.5 7.0 56603.5 10.4
4 0 L 298.2 308.7 1408.0 40.0 9.0 56320.0 10.5
In [ ]:
data.shape  # sanity check: expect 10000 rows x 9 columns after the transforms above
Out[ ]:
(10000, 9)
In [ ]:
# Automated EDA report (ydata-profiling, the successor of pandas-profiling).
from ydata_profiling import ProfileReport

profile = ProfileReport(data, title="Pandas Profiling Report")
In [ ]:
profile  # render the profiling report inline (triggers summarize/generate/render)
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

In [ ]:
data.describe().T  # transposed summary statistics for the numeric columns
Out[ ]:
count mean std min 25% 50% 75% max
Machine failure 10000.0 0.09900 0.561988 0.0 0.0 0.0 0.00 5.0
Air temperature [K] 10000.0 300.00493 2.000259 295.3 298.3 300.1 301.50 304.5
Process temperature [K] 10000.0 310.00556 1.483734 305.7 308.8 310.1 311.10 313.8
Rotational speed [rpm] 10000.0 1538.77610 179.284096 1168.0 1423.0 1503.0 1612.00 2886.0
Torque [Nm] 10000.0 39.98691 9.968934 3.8 33.2 40.1 46.80 76.6
Tool wear [min] 10000.0 107.95100 63.654147 0.0 53.0 108.0 162.00 253.0
Power 10000.0 59967.14704 10193.093881 10966.8 53105.4 59883.9 66873.75 99980.4
Temperature difference 10000.0 10.00063 1.001094 7.6 9.3 9.8 11.00 12.1
In [ ]:
data.select_dtypes(include=['object']).describe().T  # summary of categorical columns (only 'Type')
Out[ ]:
count unique top freq
Type 10000 3 L 6000
In [ ]:
data.columns  # confirm the final column set and ordering
Out[ ]:
Index(['Machine failure', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Power', 'Temperature difference'],
      dtype='object')
In [ ]:
# Column groups reused by the plots below.
num_cols = ['Air temperature [K]', 'Process temperature [K]',
            'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
# NOTE(review): the original list also contained 'Failure type', but no such
# column exists after the failure modes were recoded into 'Machine failure'.
cat_cols = ['Type']
label = 'Machine failure'
In [ ]:
# Rows with a missing air temperature (empty Series here -- no NaNs in this column).
# .loc with a boolean mask replaces the original chained indexing plus the
# redundant `.isna().astype(int) == 1` comparison, which is just `.isna()`.
data.loc[data['Air temperature [K]'].isna(), 'Air temperature [K]']
Out[ ]:
Series([], Name: Air temperature [K], dtype: float64)
In [ ]:
# Histogram + KDE for each of the five numeric features, laid out on a 3x2 grid.
plt.figure(figsize=(12, 12))
for i, col in enumerate(num_cols):
    plt.subplot(3, 2, i+1)
    sns.histplot(data, x=col, kde=True, alpha=0.2, color='red', bins=15)
    plt.title(col)
plt.suptitle("Data Distributions", fontsize=15)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Rug plot (colored by failure class) overlaid with a box plot per numeric feature.
plt.figure(figsize=(10, 7))
for i, col in enumerate(num_cols):
    plt.subplot(2, 3, i+1)
    sns.rugplot(data, x=col, hue=label, height=0.1)
    sns.boxplot(data, x=col, width=0.25)
plt.suptitle("Data Distributions")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Box plots of each numeric feature split by the multiclass failure label (0-5).
plt.figure(figsize=(10, 7))
for i, col in enumerate(num_cols):
    plt.subplot(2, 3, i+1)
    sns.boxplot(data, x=label, y=col, width=0.5)
plt.suptitle("Data Distribution in Relation to Machine Failure")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Pairwise Pearson correlation of the numeric sensor features
# (dead commented-out styling lines removed).
sns.heatmap(data[num_cols].corr(), annot=True, fmt=".2f")
plt.title("Heatmap Analysis")
plt.show()
No description has been provided for this image
In [ ]:
data[num_cols].corr()  # same correlation matrix as the heatmap, in table form
Out[ ]:
Air temperature [K] Process temperature [K] Rotational speed [rpm] Torque [Nm] Tool wear [min]
Air temperature [K] 1.000000 0.876107 0.022670 -0.013778 0.013853
Process temperature [K] 0.876107 1.000000 0.019277 -0.014061 0.013488
Rotational speed [rpm] 0.022670 0.019277 1.000000 -0.875027 0.000223
Torque [Nm] -0.013778 -0.014061 -0.875027 1.000000 -0.003093
Tool wear [min] 0.013853 0.013488 0.000223 -0.003093 1.000000
In [ ]:
# Hexbin density of the two strongly correlated temperature readings (~0.88).
data.plot.hexbin(x='Air temperature [K]', y='Process temperature [K]',
                 gridsize=20, cmap='Purples', figsize=(5, 4))
plt.title("Hexbin Plot Between Process Temperature and Air Temperature")
plt.show()
No description has been provided for this image
In [ ]:
# Hexbin density of rotational speed vs torque (strong negative correlation, ~-0.88).
data.plot.hexbin(x='Rotational speed [rpm]', y='Torque [Nm]',
                 gridsize=30, cmap='Purples', figsize=(5, 4))
plt.title("Hexbin Plot Between Torque and Rotational speed")
plt.show()
No description has been provided for this image
In [ ]:
# Contingency table: row counts per product Type and failure class,
# with 'All' totals from margins=True. `aggfunc=len` replaces the
# needless `lambda x: len(x)` wrapper -- identical counting behavior.
type_machine_failure = data[['Type', 'Machine failure']].pivot_table(
    index='Type', columns='Machine failure', aggfunc=len, margins=True)
print(type_machine_failure)
plt.figure(figsize=(6,6))
sns.heatmap(type_machine_failure, annot=True, fmt='g', cmap='Blues', cbar=False, linewidths=0.5)
plt.title("Type vs Machine Failure")
plt.show()
Machine failure     0   1    2   3   4   5    All
Type                                             
H                 979   6    8   4   2   4   1003
L                5757  24   68  51  87  13   6000
M                2916  12   30  28   9   2   2997
All              9652  42  106  83  98  19  10000
No description has been provided for this image
In [ ]:
# Explicit imports instead of `from pycaret.classification import *`:
# the wildcard pollutes the notebook namespace and hides where names come from.
# These are exactly the pycaret functions used in the cells below.
from pycaret.classification import (
    setup, compare_models, plot_model, save_model,
    calibrate_model, automl, create_app,
)

# Initialize the pycaret experiment: target is the multiclass failure label,
# fixed seed for reproducibility, stratified train/test split.
s = setup(data, target='Machine failure', session_id=42, data_split_stratify=True)
In [ ]:
best_model = compare_models(sort = 'AUC')  # cross-validated model leaderboard, ranked by AUC
Initiated . . . . . . . . . . . . . . . . . . 12:06:02
Status . . . . . . . . . . . . . . . . . . Loading Dependencies
Estimator . . . . . . . . . . . . . . . . . . Compiling Library
In [ ]:
# plot_model creates and sizes its own figure, so the preceding
# plt.figure(figsize=(4,3)) had no effect and only left a stray empty figure.
plot_model(best_model, plot = 'confusion_matrix')
No description has been provided for this image
In [ ]:
# plot_model creates its own figure; the preceding plt.figure call was a no-op.
plot_model(best_model, plot = 'auc')
No description has been provided for this image
In [ ]:
# plot_model creates its own figure; the preceding plt.figure call was a no-op.
plot_model(best_model, plot = 'learning')
No description has been provided for this image
In [ ]:
plot_model(best_model, plot = 'feature')  # feature importances of the best model
No description has been provided for this image
In [ ]:
save_model(best_model, "ai4i2020_pycaret_model")  # persist the full preprocessing pipeline + model as a .pkl
Transformation Pipeline and Model Successfully Saved
Out[ ]:
(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Air temperature [K]',
                                              'Process temperature [K]',
                                              'Rotational speed [rpm]',
                                              'Torque [Nm]', 'Tool wear [min]',
                                              'Power',
                                              'Temperature difference'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=n...
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='sqrt',
                                         max_leaf_nodes=None, max_samples=None,
                                         min_impurity_decrease=0.0,
                                         min_samples_leaf=1, min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         monotonic_cst=None, n_estimators=100,
                                         n_jobs=-1, oob_score=False,
                                         random_state=42, verbose=0,
                                         warm_start=False))],
          verbose=False),
 'ai4i2020_pycaret_model.pkl')
In [ ]:
#plot_model(best_model, plot = 'calibration') # NOT WORKING
In [ ]:
calibrated_model = calibrate_model(best_model)  # wraps the model in CalibratedClassifierCV (sigmoid method, per output below)
In [ ]:
#plot_model(calibrated_model, plot = 'calibration') # NOT WORKING
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[65], line 1
----> 1 plot_model(calibrated_model, plot = 'calibration') # NOT WORKING

File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\utils\generic.py:964, in check_if_global_is_not_none.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    962     if globals_d[name] is None:
    963         raise ValueError(message)
--> 964 return func(*args, **kwargs)

File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\classification\functional.py:1725, in plot_model(estimator, plot, scale, save, fold, fit_kwargs, plot_kwargs, groups, verbose, display_format)
   1611 @check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT)
   1612 def plot_model(
   1613     estimator,
   (...)
   1622     display_format: Optional[str] = None,
   1623 ) -> Optional[str]:
   1624     """
   1625     This function analyzes the performance of a trained model on holdout set.
   1626     It may require re-training the model in certain cases.
   (...)
   1722 
   1723     """
-> 1725     return _CURRENT_EXPERIMENT.plot_model(
   1726         estimator=estimator,
   1727         plot=plot,
   1728         scale=scale,
   1729         save=save,
   1730         fold=fold,
   1731         fit_kwargs=fit_kwargs,
   1732         plot_kwargs=plot_kwargs,
   1733         groups=groups,
   1734         verbose=verbose,
   1735         display_format=display_format,
   1736     )

File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\classification\oop.py:2071, in ClassificationExperiment.plot_model(self, estimator, plot, scale, save, fold, fit_kwargs, plot_kwargs, groups, verbose, display_format)
   1957 def plot_model(
   1958     self,
   1959     estimator,
   (...)
   1968     display_format: Optional[str] = None,
   1969 ) -> Optional[str]:
   1970     """
   1971     This function analyzes the performance of a trained model on holdout set.
   1972     It may require re-training the model in certain cases.
   (...)
   2068 
   2069     """
-> 2071     return super().plot_model(
   2072         estimator=estimator,
   2073         plot=plot,
   2074         scale=scale,
   2075         save=save,
   2076         fold=fold,
   2077         fit_kwargs=fit_kwargs,
   2078         plot_kwargs=plot_kwargs,
   2079         groups=groups,
   2080         verbose=verbose,
   2081         display_format=display_format,
   2082     )

File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\internal\pycaret_experiment\tabular_experiment.py:2045, in _TabularExperiment.plot_model(self, estimator, plot, scale, save, fold, fit_kwargs, plot_kwargs, groups, feature_name, label, verbose, display_format)
   1933 def plot_model(
   1934     self,
   1935     estimator,
   (...)
   1946     display_format: Optional[str] = None,
   1947 ) -> Optional[str]:
   1948     """
   1949     This function takes a trained model object and returns a plot based on the
   1950     test / hold-out set. The process may require the model to be re-trained in
   (...)
   2043 
   2044     """
-> 2045     return self._plot_model(
   2046         estimator=estimator,
   2047         plot=plot,
   2048         scale=scale,
   2049         save=save,
   2050         fold=fold,
   2051         fit_kwargs=fit_kwargs,
   2052         plot_kwargs=plot_kwargs,
   2053         groups=groups,
   2054         feature_name=feature_name,
   2055         label=label,
   2056         verbose=verbose,
   2057         display_format=display_format,
   2058     )

File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\internal\pycaret_experiment\tabular_experiment.py:427, in _TabularExperiment._plot_model(self, estimator, plot, scale, save, fold, fit_kwargs, plot_kwargs, groups, feature_name, label, verbose, system, display, display_format)
    425 if self.is_multiclass:
    426     if plot in multiclass_not_available:
--> 427         raise ValueError(
    428             "Plot Not Available for multiclass problems. Please see docstring for list of available Plots."
    429         )
    431 # exception for CatBoost
    432 # if "CatBoostClassifier" in str(type(estimator)):
    433 #    raise ValueError(
   (...)
    436 
    437 # checking for auc plot
    438 if not hasattr(estimator, "predict_proba") and plot == "auc":

ValueError: Plot Not Available for multiclass problems. Please see docstring for list of available Plots.
In [ ]:
automl()  # return the best model from this pycaret session (here: the calibrated random forest)
Out[ ]:
CalibratedClassifierCV(cv=5, ensemble=True,
                       estimator=RandomForestClassifier(bootstrap=True,
                                                        ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features='sqrt',
                                                        max_leaf_nodes=None,
                                                        max_samples=None,
                                                        min_impurity_decrease=0.0,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        monotonic_cst=None,
                                                        n_estimators=100,
                                                        n_jobs=-1,
                                                        oob_score=False,
                                                        random_state=42,
                                                        verbose=0,
                                                        warm_start=False),
                       method='sigmoid', n_jobs=None)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CalibratedClassifierCV(cv=5, ensemble=True,
                       estimator=RandomForestClassifier(bootstrap=True,
                                                        ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features='sqrt',
                                                        max_leaf_nodes=None,
                                                        max_samples=None,
                                                        min_impurity_decrease=0.0,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        monotonic_cst=None,
                                                        n_estimators=100,
                                                        n_jobs=-1,
                                                        oob_score=False,
                                                        random_state=42,
                                                        verbose=0,
                                                        warm_start=False),
                       method='sigmoid', n_jobs=None)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       monotonic_cst=None, n_estimators=100, n_jobs=-1,
                       oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
RandomForestClassifier(n_jobs=-1, random_state=42)
In [ ]:
#create_app(best_model)